
**************************************************************************************************************************************************
******************* Code for 'Doing Less with Less: Capital Misallocation, Investment and the Productivity Slowdown in Australia' *******************
***************************************************** Code for Data manipulation larger sample ******************************************************************

**************************************************DATE: AUG 2022 ******************************************************************************
**************************************************** AUTHOR: Jonathan Hambur *******************************************************************
*********************************************************************************************************************************************

*** Code structure
*00. Preliminaries and globals
*01. Initial data clean - same as mark-up estimation
*02. Merge outside data
*03. Variable construction

**** Imported data series
* m_DLW_tl_m_w_y estimated firm mark-up. TL indicates translog (blank = CD), M indicates based on the intermediate input elasticity. W and Y indicate that wages and year FE, respectively, were used in the first stage. See Hambur (2021)
* prod_DLW_tl_w_y log MFP. From the above estimation
* K bit


**00. Preliminaries and globals
* All settings below are do-file locals; later sections read them by name.

*** Deflators
local go_defl  "go_ipd_02"   // gross output
local int_defl "int_ipd_02"  // intermediate inputs
local va_defl  "va_ipd_02"   // value added, deflated directly
local k_defl   "K_defl"      // capital stock
local inv_defl "GFCF_defl"   // investment

*** Variable choices
local wage_meas "labour"     // measure of labour income
local output    "income"     // output measure - income of va
local input     "variable2"  // input measure
local labour    "fte"        // measure of employment
local cuts      "1"          // firms employing less than this are removed
local ind       "industry"

*** Capital stock measure: current book value (0), lagged book value (1), or perpetual inventory estimate (2)
* Preferred metric is the lagged one, capturing the K stock going into the period
local k_meas "1"

*** Value-added deflation: s = deflate VA directly, d = deflate gross output and
*** intermediate inputs separately (with the BIT measure this must be s)
local s_or_d_defl "s"

*** Instrument sets (Cobb-Douglas and translog)
local k_instr    "k"
local k_instr_tl "k k2 l1k1_instr m1k1_instr"

*** Trimming of extreme growth: none (0), large share of industry change (1),
*** outlier within industry (2); based on multiprod rules
local trim "0"



***********************************************01. Initial variable construction
** Load the firm-level data and declare the panel (firmid by fyear)

use "$data\full_prod_bit.dta", clear 


xtset firmid fyear
** Coarser industry codes, obtained by dropping the last two / last one digit of industry
g ind_02 = floor(industry/100)
g ind_03 = floor(industry/10)

* Deflators, as selected by the locals in section 00
g def_y = `go_defl'

g def_yva = `va_defl'

g def_in =`int_defl'

* Capital stock: nominal book value (K_nom), its deflator (def_k) and the real stock (K).
* k_meas = 0 uses the current-year values, k_meas = 1 uses the one-year lag.
* NOTE(review): def_k reads the dataset variable k_defl directly; the k_defl local
* set in section 00 is not referenced - confirm this is intended.
if `k_meas' == 0 {
	g K_nom = k_bit
	g def_k = k_defl
	g K = K_nom/def_k
}
else if `k_meas' == 1 {
	g K_nom = l.k_bit
	g def_k = l.k_defl
	g K = K_nom/def_k
}
else {
	* Any other value (including the perpetual inventory option, 2) has no branch here.
	* Stop now rather than failing later on a missing K variable.
	display as error "k_meas = `k_meas' is not supported in this do-file: use 0 (current) or 1 (lagged) book value"
	exit 198
}
* Nominal series, taken from the variables named in the section 00 locals
g wagebill = `wage_meas'
g Y_nom    = `output'
g Yva_nom  = (`output'-`input')
g M_nom    = `input'

* Real gross output (level and log) and real intermediate inputs
g Y = Y_nom/def_y
g y = ln(Y)
g M = M_nom/def_in

* Real value added: single deflation (s) divides nominal VA by the VA deflator,
* double deflation (d) subtracts real inputs from real gross output
if "`s_or_d_defl'" == "s" g YVA = Yva_nom/def_yva
if "`s_or_d_defl'" == "d" g YVA = Y-M

* Logs of value added, capital, labour and intermediate inputs
g yva = ln(YVA)
g k = ln(K)

g L = `labour'
g l = ln(L)

g m = ln(M)

* Average wage per unit of labour. When the labour measure is fte_adj the wage bill
* is divided by L-1 rather than L. The nominal wage is deflated with the gross
* output deflator.
local w_denom "L"
if "`labour'" == "fte_adj" local w_denom "(L-1)"

g W_nom = wagebill/`w_denom'
g W = W_nom/def_y
g w = ln(W)



** Remove firms with employment below the cuts threshold

drop if L<`cuts'


*---------------creating variables used in MFP est--------------------------------------------------------------------*
* Higher-order and interaction terms of the log inputs for the translog, with their
* one-year lags (l. operator; relies on the xtset declared above)

* Labour
g l_lag=l.l
g l2 =l^2
g l2_lag = l.l2

g l_2lag = l2.l  // second lag of l

g l1m1 = l*m
g l1m1_lag = l.l1m1 

* Capital
g k_lag = l.k
g k2= k^2
g k2_lag = l.k2

g l1k1 = l*k
g l1k1_lag = l.l1k1
g l1k1_instr = l_lag*k  // lagged labour times current capital

* Intermediate inputs
g m_lag= l.m
g m2 = m^2
* Fix: this was m2^2 (m to the fourth power); the lag of m2 matches l2_lag and k2_lag
g m2_lag = l.m2

g m1k1 = m*k
g m1k1_lag = l.m1k1
g m1k1_instr = l.m*k  // lagged intermediates times current capital

* Wages
g w_lag = l.w

******* Labour productivity measures: real gross output per unit of each employment measure
foreach emp in fte hcnt fte_adj {
	g l_prod_`emp' = Y/`emp'
}


**** Simple Lerner-type margins, all as a share of nominal output
g lerner = (Y_nom-variable1)/Y_nom
g lerner_depr = (Y_nom-variable1-deprexps)/Y_nom
g profit = (Y_nom-totlexps)/Y_nom

* Keep only observations with non-missing log output, labour and capital.
* Log intermediate inputs (m) is not required: that drop stays commented out.
drop if y == . | l == . | k == .
*drop if m == .

* Log nominal output and its growth
g ly = log(Y_nom)

* NOTE(review): d.l.ly is the one-period change of lagged ly (l.ly minus l2.ly),
* i.e. lagged rather than contemporaneous sales growth - confirm this is intended.
g dlgo = d.l.ly

* Save the growth series on its own, then return to the full dataset
preserve
keep id fyear dlgo
save "$data\addit_sales_grow.dta", replace
restore


*** 02. Merge outside data
*** The dataset for mark-up estimation is kept skinny, so bring in some extra variables here, as well as some industry metrics.
*** Every merge keeps master and matched observations only, so no rows are added to the firm panel.
merge 1:1 fyear id using "$outputs\frame.dta", keepusing(x_state) keep(master match)
drop _merge

** State unemployment data from ABS LFS - csv included in supplementary material
rename x_state State
merge m:1 fyear State using "$data\Unemp.dta",  keep(master match)
drop _merge

** If extra balance sheet data are needed they can be brought in here. Taken from BIT - not used
*merge 1:1 fyear id using "$outputs\extra_bs.dta", keepusing(ncl ncl_ currliab totlliab) keep(master match)
*drop _merge

** Birth-date data (yob is used for firm age below). Taken from the birthdate dataset
* NOTE(review): this path and the next one are hard-coded rather than built from the data/outputs globals.
merge m:1 tsid id using "P:\2019 update\Raw data\birth.dta", keep(master match)
drop _merge

** Rajan and Zingales financial dependence and intangibles metrics - included as csv in supplementary material
merge m:1 industry using "P:\mANIP DATA\rajan_zing.dta", keep(master match)
drop _merge

** PIM measures

merge 1:1 fyear id using "$outputs\pim.dta", keepusing(rk_pim* inv r_inv) keep(master match)
drop _merge

** Industry mark-up measures
** Levels
* Fix: the mark-up variables are named with the global macro reg, as in the demeaning
* step later in this file; the local macro reg is not defined in this do-file.
* Fix: keep(master match) added, as on every other merge, so industry-years with no
* firm in the master data are not appended as empty rows.
* NOTE(review): the local macro file in the file name is not defined in the visible
* part of this do-file and so expands to nothing - confirm the intended file name.
merge m:1 fyear industry using "$outputs\sum_k_`k_meas'_instr_`k_instr'_out_`output'_int_`input'_trm_`trim'_`ind'_`file'_l`cuts'.dta", keepusing(mi_$reg wig_$reg) keep(master match)
drop _merge


compress

********************* 03 Variable construction

** Market shares: firm nominal output over the yearly total, economy-wide and within industry
egen sales_tot = total(Y_nom), by(fyear)
egen sales_ind = total(Y_nom), by(fyear industry)

g tot_share = Y_nom/sales_tot
g ind_share = Y_nom/sales_ind


** Fixed cost measures: logs, then shares of nominal output
* NOTE(review): the lines constructing fixed2 and fixed3 are commented out, so both
* variables must already exist in the loaded data - confirm.
*replace fixed2 = totlexps -variable2-labour
*g fixed3 = fixed2-deprexps
foreach n in 2 3 {
	g lfixed`n' = log(fixed`n')
}

foreach n in 2 3 {
	g fixed_share`n' = (fixed`n'/Y_nom)
}

** Investment ratios

* Capital to output ratio and its 99th percentile over the whole sample
g k_sh = K_nom/Y_nom
egen k_sh6 = pctile(k_sh), p(99)

* Industry-by-year group identifier
egen ind_yr = group(industry fyear)

* Investment to output ratio, divided by 1000000 and capped at 1
g i_y = (inv/Y_nom)/1000000
replace i_y = 1 if i_y != . & i_y > 1
*****  Productivity measures

** Labour productivity

* Value added per FTE, level and log
g l_prod_fte_va = YVA/fte
g ll_prod_fte_va = log(l_prod_fte_va)
* Gross output per FTE, log
g ll_prod_fte = log(l_prod_fte)

** Log deviation from the industry-year mean
bysort industry fyear: egen prod_va_mean = mean(l_prod_fte_va)
bysort industry fyear: egen prod_mean = mean(l_prod_fte)

g dev_prod_lva = ln(l_prod_fte_va/prod_va_mean)
g dev_prod_l = ln(l_prod_fte/prod_mean)

** Extremes: 1st and 99th percentiles of each deviation over the whole sample
foreach dev in dev_prod_lva dev_prod_l {
	foreach pct in 1 99 {
		egen `dev'`pct' = pctile(`dev'), p(`pct')
	}
}


**** Capital productivity: real value added per unit of real capital, level and log
g va_k = YVA/K
g lva_k = ln(va_k)
* Log deviation from the industry-year mean
bysort industry fyear: egen prod_kva_mean = mean(va_k)
g dev_prod_kva = ln(va_k/prod_kva_mean)
* Outlier thresholds: 1st and 99th percentiles over the whole sample
foreach pct in 1 99 {
	egen dev_prod_kva`pct' = pctile(dev_prod_kva), p(`pct')
}

  
******** Construct LHS variables
compress

* Re-declare the panel before using lag operators: the data were re-sorted by
* industry and fyear in the productivity section above
xtset firmid fyear

** Bounded employment growth: change over the average of current and lagged FTE,
** defined only when both are positive and non-missing
g dfte = (fte-l.fte)/(0.5*fte+0.5*l.fte) if fte>0 & fte !=. & l.fte>0 & l.fte !=.
*replace dfte = 2 if l.fte ==. & fte > 0 & fyear !=2002 // check if want to have entry or exit in here?
*replace dfte = 2 if l.fte ==0 & fte > 0

** Bounded PIM growth: same formula for every rk_pim variable merged in from the PIM file
ds rk_pim*
foreach var in `r(varlist)' {
	g d_`var'_bound = (`var'-l.`var')/(0.5*`var'+0.5*l.`var') if `var'>0 & `var'!=. & l.`var'>0 & l.`var'!=.
}

** Bounded capital stock growth - note timing is equated to Decker et al, where capital at the end of the previous year is the relevant metric

g d_k_bound = (K-l.K)/(0.5*K+0.5*l.K) if K>0 & K!=. & l.K>0 & l.K!=.


** Investment rate
g K_contemp = k_bit/k_defl // note here using K at end of current year
g i_k = (r_inv/l.K_contemp)/1000000 // r_inv not scaled so scaling to $m
replace i_k = 2.6 if i_k>2.6 & i_k!=. // winsorise a bit above the 99th percentile
g l_i_k = l.i_k

* Fix: i_y (investment to output, capped at 1) is already generated, with the same
* formula, in the investment ratios part of section 03. Generating it again here
* stopped the do-file with a variable-already-defined error, so the duplicate is removed.


************** Size and age
* Size class by FTE: 1 = under 5, 2 = 5 to under 20, 3 = 20 to under 200,
* 4 = 200 to under 500, 5 = 500 and over; missing when fte is missing
g size = cond(fte<5, 1, cond(fte<20, 2, cond(fte<200, 3, cond(fte<500, 4, cond(fte!=., 5, .)))))

* Firm age: tsid minus year of birth
g age = tsid-yob
tab age if tsid == 2005

* Young-firm indicator (age of 5 or less), missing when age is missing.
* Note issues with age: lots of firms are 'born' in 2001 due to the introduction of
* the GST, so the young variable is biased pre 2005
g young = (age<=5) if age!=.


********************************* Balance sheet metrics
* Gearing is defined as assets / liabilities. Rescaled by 1000000 as total assets had been scaled down by $m
g gearing =totlasst/totlliab*1000000
replace gearing =  10 if gearing>10 & gearing!=. // topcode at 10
* Positive assets and zero liabilities give a missing ratio; assign the topcode instead.
* Fix: Stata treats missing as larger than any number, so totlasst>0 alone is also
* true when totlasst is missing. Those firms are now left with missing gearing.
replace gearing = 10 if totlasst>0 & totlasst!=. & totlliab==0


* Industry-year percentiles of gearing
foreach var in gearing  {
	bysort industry fyear: egen p50_`var' = pctile(`var'), p(50)
	bysort industry fyear: egen p75_`var' = pctile(`var'), p(75)
	bysort industry fyear: egen p25_`var' = pctile(`var'), p(25)
	bysort industry fyear: egen p10_`var' = pctile(`var'), p(10)
	bysort industry fyear: egen p90_`var' = pctile(`var'), p(90)
	
}

** Indicators for gearing below the industry-year median / 25th / 10th percentile.
* Fix: left missing when gearing is missing (previously coded 0, because a missing
* value is never below the threshold), consistent with cf_p below.
g gear_50 = (gearing<p50_gearing) if gearing!=.
g gear_25 = (gearing<p25_gearing) if gearing!=.
g gear_10 = (gearing<p10_gearing) if gearing!=.

*** Cash flow measures: nominal output less total expenses, adding back depreciation
g cf = Y_nom-totlexps+deprexps
g cf_p = (cf>0) // cashflow positive indicator
replace cf_p = . if cf==.

compress

compress


******* Demean some of the industry or state metrics so the focus is on changes
** Mark-up measures (mi and wig): subtract the industry mean taken over all observations
foreach stub in mi wig {
	bysort industry: egen mean_`stub'_$reg = mean(`stub'_$reg)
	g demean_`stub'_$reg = `stub'_$reg - mean_`stub'_$reg
}


** Unemployment
* National rate less its mean over the whole sample
* NOTE(review): the variable is named demand_unemp_nat, not demean_unemp_nat. The name
* is kept because files that use this dataset may refer to it.
egen mean_unemp_nat = mean(Unemp_nat)
g demand_unemp_nat = Unemp_nat - mean_unemp_nat

* State rate less the mean within State
bysort State: egen mean_unemp_state = mean(Unemp_state)
g demean_unemp_state = Unemp_state - mean_unemp_state



******** Period variables

***** Splitting the sample into 3 periods
* Fix: the last period is now assigned only when tsid is non-missing. Stata treats
* missing as larger than any number, so tsid>=2012 (or 2011) alone is also true for
* a missing tsid.
** Main definition: before 2008, 2008 to 2011, 2012 onwards
	g period1 = 1 if tsid<2008
	replace period1 = 2 if tsid>=2008 & tsid<2012
	replace period1 = 3 if tsid>=2012 & tsid!=.

** Alternate test definition: before 2008, 2008 to 2010, 2011 onwards
	g period2 = 1 if tsid<2008
	replace period2 = 2 if tsid>=2008 & tsid<2011
	replace period2 = 3 if tsid>=2011 & tsid!=.

** Trend as in Andrews and Hansell. Not preferred as it is a parametric restriction
	g trend = tsid-2006
	g trend2 = trend^2
	
	g trend_05 = tsid-2005
	g trend2_05 = trend_05^2

	
**** Defining digital intensity based on Calvino et al OECD paper
* intense takes the values 1 to 4 and is assigned by ranges of the industry code and,
* for some firms, by division. The replace statements run in order, so a later
* statement overrides an earlier one wherever their conditions overlap.
* NOTE(review): all range tests are strict, so codes exactly on a boundary (for example
* 1300, 1400, 1700, 2300, 2400, 5800) are set only if another statement covers them.
* Codes matched by no statement leave intense missing.
	** Digital intensity
	g intense = 1 if  industry < 1300
replace intense = 2 if industry > 1300 & industry < 1400
replace intense = 3 if industry > 1400 & industry < 1700
replace intense = 2 if industry > 1700 & industry < 2300
replace intense = 4 if industry > 2300 & industry < 2400
replace intense = 3 if industry > 2400 & industry < 2599
replace intense = 2 if industry ==2599 
* Division-based assignments
replace intense = 1 if division == "D" | division == "E" | division == "H" | division == "I"
replace intense = 3 if division == "G" | division == "F" | division == "R" | division == "S"
* NOTE(review): the next statement resets every code strictly between 2500 and 5800 to 3.
* That overrides the value of 2 just given to 2599, and any division-based value for
* firms whose industry code lies in this range. Confirm the lower bound is intended.
replace intense = 3 if industry > 2500 & industry < 5800
replace intense = 4 if industry > 5800 & industry < 6200
* 6010 lies inside the previous range and is reset from 4 to 2
replace intense = 2 if industry == 6010
replace intense = 4 if industry > 6600 & industry < 6700
replace intense = 1 if industry > 6700 & industry < 6900
replace intense = 4 if industry > 6900 & industry < 7500

* Write the final regression dataset, overwriting any existing file
save "$outputs\reg_file_large.dta", replace